In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
#load the files
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
data = pd.concat([train, test]).reset_index(drop=True)
#size of training dataset
train_samples = train.shape[0]
#print some of them
data.head()
Out[1]:
In [2]:
#show the data types
# (dtypes plus non-null counts for the combined train+test frame)
data.info()
In [3]:
#heatmap
# Pairwise correlations of the numeric columns; annot prints the values.
#fig = plt.figure(figsize=(9, 7))
sns.heatmap(data.corr(), annot=True);
At first sight, the features most correlated with 'Survived' are 'Pclass' and 'Fare'.
In [4]:
# How many passengers died (0) vs. survived (1) among the labelled rows.
data.groupby('Survived')['Survived'].count()
Out[4]:
There are more non-survivors than survivors so we will have to deal with it.
In [5]:
# Drop identifier-like columns that are not used as features here.
data = data.drop(columns=['Name', 'PassengerId', 'Ticket'])
data.head(10)
Out[5]:
A cabin value consists of a deck letter followed by a room number, and in some cases a passenger has more than one cabin. We can extract the letter to learn the importance and location of the passenger inside the ship. Here you have the real deck distribution.
Empty values will be set to zero.
In [6]:
# Raw cabin strings: a deck letter plus room number(s); NaN when unknown.
data.Cabin.head(5)
Out[6]:
In [7]:
# Cabin
# Encode the deck (first letter of the cabin string) as an integer.
# Unknown cabins are filled with '0' and stay 0 after the mapping.
data['Cabin'] = data['Cabin'].fillna('0')
deck_codes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}
for deck, code in deck_codes.items():
    data.loc[data.Cabin.str[0] == deck, 'Cabin'] = code
In [8]:
# Change the type from object to int
# (the column still mixes the '0' strings with the integer deck codes)
data.Cabin = pd.to_numeric(data.Cabin)
Now we can show how this feature is related to the survival in the train dataset.
In [9]:
# Deck histograms for survivors. Group by the filtered frame's own column
# rather than the full-frame Series `data.Cabin`, which relied on index
# alignment between frames of different lengths.
data[data.Survived==1].groupby('Cabin').Cabin.hist();
In [10]:
# Same deck histograms for non-survivors; group by the column name instead
# of the full-frame Series (no reliance on index alignment).
data[data.Survived==0].groupby('Cabin').Cabin.hist();
In [11]:
# Share of passengers whose cabin is unknown (encoded as 0).
unknown_cabins = data.loc[data.Cabin == 0, 'Cabin'].count()
unknown_cabins / data.Cabin.count()
Out[11]:
Another problem: about 77% of the cabins are unknown. But looking at the previous graphs, we can assume that a passenger was likely to die if the cabin is unknown.
So we can transform 'Cabin' into a binary feature.
In [12]:
# Collapse the deck codes to a binary flag: 1 = cabin known, 0 = unknown.
data.loc[data.Cabin != 0, 'Cabin'] = 1
In [13]:
# Correlations again, now that Cabin is a numeric binary feature.
sns.heatmap(data.corr(), annot=True);
In [14]:
# Survival rate by port of embarkation.
# factorplot was deprecated (and later removed) in seaborn; catplot is its
# direct replacement and accepts the same arguments here.
g = sns.catplot(x="Embarked",y="Survived",kind="bar",data=data)
In [15]:
# Embarked
# Encode the port of embarkation as an integer; NaN rows stay untouched.
for port, code in (('C', 1), ('Q', 2), ('S', 3)):
    data.loc[data.Embarked == port, 'Embarked'] = code
In [16]:
# Rows whose port of embarkation is missing.
data[data.Embarked.isna()]
Out[16]:
In [17]:
# They are females of the first class and survivors -> assumption: they embarked in C
# (1 == 'C' after the integer encoding above)
data.Embarked.fillna(1, inplace=True)
Obviously the family size is quite important because it can directly affect whether a person survived or not. The dataset has two features related to this, 'SibSp' and 'Parch'. The first one means 'siblings and spouse' and the second one 'parents and children'. Both features are numeric, so they can be combined into a new one representing the size of the family: simply sum them and assign the corresponding size. The following sizes are used: 'alone', 'medium' (1-4) and 'large' (more than 4).
After that, the original features will be deleted.
In [18]:
#New column to know if the passenger has family on board
def family(size):
    """Bucket a family-size count: 'alone' (0), 'medium' (1-4), 'large' (5+)."""
    return "alone" if size == 0 else ("medium" if size < 5 else "large")
# Total relatives on board, bucketed; then drop the raw count columns.
data['FamilySize'] = (data['SibSp'] + data['Parch']).apply(family)
data = data.drop(columns=['SibSp', 'Parch'])
data.head()
Out[18]:
In [19]:
# Mean age of 3rd-class women (candidate fill value for their missing ages).
# The original cell also computed a full groupby mean whose result was
# discarded -- only the last expression of a cell is displayed.
cond = (data['Sex']=='female') & (data['Pclass']==3)
data[cond].groupby(['Sex','Pclass'])['Age'].mean()
Out[19]:
The case of age is not that easy. There could be some correlation between the class the passenger is traveling in, their gender, and whether the passenger survived. So it makes sense to take these three features into account to estimate the most likely age.
In [20]:
def getAge(row):
    """Estimate an age for `row` as the mean age of similar passengers.

    "Similar" means same Survived/Sex/Pclass when the survival label is
    known (train rows); otherwise same Sex/Pclass only (test rows, where
    Survived is NaN). Reads the module-level `data` frame.
    """
    surv = row.Survived
    sex = row.Sex
    pclass = row.Pclass
    if surv==0 or surv==1:
        # Labelled row: condition on the survival outcome as well.
        condition = (data['Survived']==surv) & (data['Sex']==sex) & (data['Pclass']==pclass)
        df_mean = data[condition].groupby(['Survived','Sex','Pclass'])['Age'].mean()
    else:
        # Unlabelled (test) row: Survived is NaN, so match on Sex/Pclass only.
        condition = (data['Sex']==sex) & (data['Pclass']==pclass)
        df_mean = data[condition].groupby(['Sex','Pclass'])['Age'].mean()
    #print("surv: {}, sex: {}, class: {} -> age (mean): {}".format(surv, sex, pclass, df_mean.mean()))
    # The filter leaves a single group, so .mean() just extracts its value.
    return df_mean.mean()
# Compute an estimate per row, but fill only the missing ages with it.
data['Age'] = data['Age'].fillna(data.apply(getAge, axis=1))
In [21]:
# Sanity check: number of Age values still missing (count() skips NaN).
len(data['Age']) - data['Age'].count()
Out[21]:
In [22]:
# Fare distribution; the next cells try a log transform on it.
data.Fare.hist(bins=50)
Out[22]:
In [23]:
# Log-transform fares; zero fares are kept as 0 to avoid log(0).
d1 = data["Fare"].apply(lambda fare: np.log(fare) if fare > 0 else 0)
In [24]:
# Distribution of the log-transformed fares.
d1.hist(bins=50)
Out[24]:
In [25]:
# Impute missing fares with the column mean.
mean_fare = data['Fare'].mean()
data['Fare'] = data['Fare'].fillna(mean_fare)
In [26]:
#define age by ranges
def getAgeRange(age):
    """Map a numeric age to a coarse band: child <5, young <20, adult <50, old otherwise."""
    for label, upper in (("child", 5), ("young", 20), ("adult", 50)):
        if age < upper:
            return label
    return "old"
# Replace the numeric ages with the categorical bands defined above.
data['Age'] = data['Age'].apply(getAgeRange)
In [27]:
# Age distribution
# Survival stats (count/mean/std/...) of the 0/1 label per age band.
data.groupby(['Age'])['Survived'].describe()
Out[27]:
In [28]:
#Transform categorical to dummies
# One-hot encode the remaining categorical (object-dtype) columns.
data = pd.get_dummies(data)
In [29]:
from sklearn.covariance import EllipticEnvelope

def idxAnomalies(X):
    """Return the row indices of X flagged as outliers.

    Fits an EllipticEnvelope (5% contamination, fixed seed) and returns the
    positions where predict() is not +1 (i.e. the -1 outliers), as a plain
    list of ints suitable for np.delete.
    """
    detector = EllipticEnvelope(contamination=0.05,
                                assume_centered=True,
                                random_state=13)
    detector.fit(X)
    flags = detector.predict(X)  # +1 inlier, -1 outlier
    # Vectorized replacement for the original np.ndenumerate comprehension.
    return np.where(flags != 1)[0].tolist()
In [30]:
#finding NaN
# Columns that still contain missing values (expected: only Survived,
# which is NaN for the test rows).
data.columns[data.isnull().any()].tolist()
Out[30]:
Probably the ranges of the numerical features are not the same, and this could cause problems in our results. A proper way to get good calculations is to normalize all the features.
In this case we're going to use the min-max scaler because it transforms the range of the features to values between 0 and 1.
In [31]:
# Per-column summary stats -- check which features need rescaling.
data.describe().T
Out[31]:
Excluding the dummy features, we see 'Pclass' does not move within a range of (0, 1).
In [32]:
# NOTE(review): this import should live in the first cell with the others.
import matplotlib.pyplot as plt
# Feature ranges before scaling (first 15 rows).
data.head(15).plot()
plt.show()
After the scaler processing we'll have all the features in the same range. This speeds up the calculations because all are small numbers which are easy to use (better performance).
The picture below shows the features ranges.
In [33]:
#Squeeze the data to [0,1]
from sklearn import preprocessing

# MinMaxScaler rescales every column independently, so fitting once over
# all four columns is equivalent to the original four separate fits and
# avoids refitting the same scaler object repeatedly.
scaler = preprocessing.MinMaxScaler()
scale_cols = ['Pclass', 'Fare', 'Cabin', 'Embarked']
data[scale_cols] = scaler.fit_transform(data[scale_cols])
print("Train shape: {}".format(data.shape))
# Feature ranges after scaling.
data.head(15).plot()
plt.show()
data.describe().T
Out[33]:
As a good practice, we're going to split the data into two different datasets, training and testing. Taking the number of training samples (saved at the beginning), we are able to split it.
Besides, we'll use the k-fold method to get different batches of the data (it's configured with 3 splits).
In [34]:
from sklearn.model_selection import StratifiedKFold

# Split the combined frame back into train/test using the row count saved
# at the top; train rows come first because of the original concat order.
y = np.array(data['Survived'])
X = np.array(data.drop('Survived', axis=1))
#split by idx
idx = train_samples
X_train, X_test = X[:idx], X[idx:]
y_train, y_test = y[:idx], y[idx:]
# Remove anomalies only in train dataset
# (the test set must stay intact for the submission).
idx_anomalies = idxAnomalies(X_train)
X_train = np.delete(X_train, idx_anomalies, axis=0)
y_train = np.delete(y_train, idx_anomalies, axis=0)
print("Shape train: {}".format(X_train.shape))
print("Shape test: {}".format(X_test.shape))
#print(y_train[0:1])
#print(X_train[0:1].tolist())
# 3-fold stratified splitter, used later by the voting-ensemble loop.
kf = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
print(kf)
In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
# Cross validate model with Kfold stratified cross val
kfold = StratifiedKFold(n_splits=10)
rf = RandomForestClassifier(random_state=42)
# Hyper-parameter grid: 3*3*3*5 = 135 candidates, each scored over 10 folds.
rf_param_grid = {"max_depth": [3, 5, 8],
                 "min_samples_split": [3, 10, 20],
                 "min_samples_leaf": [3, 10, 20],
                 "n_estimators" :[10, 20, 30, 40, 50]}
gsrf = GridSearchCV(rf, param_grid = rf_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)
gsrf.fit(X_train, y_train)
# Best score
gsrf.best_score_
Out[35]:
In [36]:
# Keep the best random forest and show the winning parameters.
brf = gsrf.best_estimator_
gsrf.best_params_
Out[36]:
In [63]:
from sklearn.svm import SVC
# probability=True is needed later for the soft-voting ensemble.
svc = SVC(random_state=42, probability=True)
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gammas = [0.0001, 0.001, 0.01, 0.1, 1]
svc_param_grid = [{'kernel': ['rbf'], 'gamma': gammas, 'C': Cs}]
gssvc = GridSearchCV(svc, param_grid = svc_param_grid, cv=kfold, scoring="accuracy", n_jobs=4, verbose=1)
gssvc.fit(X_train, y_train)
# Best score
gssvc.best_score_
Out[63]:
In [64]:
# Keep the best SVC and show the winning parameters.
bsvc = gssvc.best_estimator_
gssvc.best_params_
Out[64]:
In [65]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Soft-voting ensemble of the two tuned models.
eclf = VotingClassifier(estimators=[('rf', brf), ('svc', bsvc)], voting='soft')
# enumerate replaces the original manual epoch counter (which was
# incremented before use and printed as `epoch-1`).
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train), start=1):
    X_t, X_v = X_train[train_idx], X_train[val_idx]
    y_t, y_v = y_train[train_idx], y_train[val_idx]
    eclf.fit(X_t, y_t)
    # NOTE(review): cross_val_score re-fits eclf internally on sub-splits of
    # the validation fold, so these are CV scores within that fold.
    scores = cross_val_score(eclf, X_v, y_v)
    print("Scores({}): {}".format(fold, scores))
In [66]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training vs. cross-validation score as the training size grows.

    Follows the scikit-learn learning-curve example: shaded bands are
    +/- one standard deviation across the CV folds. Returns the pyplot
    module so callers can tweak the figure further.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
#
# Compare the learning behaviour of the ensemble and its two members.
plot_learning_curve(eclf, "Voting Ensemble", X_train, y_train, cv=kfold, n_jobs=4)
plot_learning_curve(brf, "Random Forest", X_train, y_train, cv=kfold, n_jobs=4)
plot_learning_curve(bsvc, "SVC", X_train, y_train, cv=kfold, n_jobs=4)
Out[66]:
Finally, we only need to print the classification report and the confusion matrix and see the outcome.
In [71]:
from sklearn.metrics import classification_report,confusion_matrix
# NOTE(review): the ensemble is evaluated on the same data it was fitted
# on, so these numbers are training performance, not held-out performance.
predictions = eclf.predict(X_train)
cm = confusion_matrix(y_train, predictions)
print(cm)
plt.matshow(cm)
plt.colorbar()
ax = plt.gca()
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()
In [72]:
# Per-class precision/recall/F1 for the training predictions above.
print(classification_report(y_train, predictions))
Seeing the ROC curve and the AUC we can evaluate if the model is a good classifier.
In [73]:
from sklearn.metrics import roc_curve, auc

# calculate the fpr and tpr for all thresholds of the classification
def plot_roc_curve(y_test, preds):
    """Plot the ROC curve for binary predictions and annotate the AUC."""
    fpr, tpr, _ = roc_curve(y_test, preds)
    area = auc(fpr, tpr)
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % area)
    plt.legend(loc='best')
    plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.title('Receiver Operating Characteristic')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
In [74]:
# NOTE(review): `predictions` are hard 0/1 labels, so this ROC has a single
# operating point; predict_proba would give a smoother curve.
plot_roc_curve(y_train, predictions)
In [43]:
import os
# Build the Kaggle submission: PassengerId of the first test row is 892.
# Use '\n' explicitly -- the original wrote os.linesep through a text-mode
# file, which on Windows double-translates into '\r\r\n'. Joining a list
# also avoids the quadratic string concatenation of the original loop.
predictions = gsrf.best_estimator_.predict(X_test)
rows = ["PassengerId,Survived"]
rows.extend("{},{}".format(892 + i, int(pred))
            for i, pred in enumerate(predictions))
file = "\n".join(rows) + "\n"
In [44]:
# Save to file
# Writes the submission string built above (header + one row per test
# passenger). NOTE(review): Kaggle expects a .csv extension on upload.
with open('attempt.txt', 'w') as f:
    f.write(file)